import pandas as pd
import numpy as np
import seaborn as sn
import plotly.express as px
import plotly.offline as pyo
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns',None)
import os
df = pd.read_csv('Chennai houseing sale.csv')
df.head()
| PRT_ID | AREA | INT_SQFT | DATE_SALE | DIST_MAINROAD | N_BEDROOM | N_BATHROOM | N_ROOM | SALE_COND | PARK_FACIL | DATE_BUILD | BUILDTYPE | UTILITY_AVAIL | STREET | MZZONE | QS_ROOMS | QS_BATHROOM | QS_BEDROOM | QS_OVERALL | REG_FEE | COMMIS | SALES_PRICE | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | P03210 | Karapakkam | 1004 | 04-05-2011 | 131 | 1.0 | 1.0 | 3 | AbNormal | Yes | 15-05-1967 | Commercial | AllPub | Paved | A | 4.0 | 3.9 | 4.9 | 4.330 | 380000 | 144400 | 7600000 |
| 1 | P09411 | Anna Nagar | 1986 | 19-12-2006 | 26 | 2.0 | 1.0 | 5 | AbNormal | No | 22-12-1995 | Commercial | AllPub | Gravel | RH | 4.9 | 4.2 | 2.5 | 3.765 | 760122 | 304049 | 21717770 |
| 2 | P01812 | Adyar | 909 | 04-02-2012 | 70 | 1.0 | 1.0 | 3 | AbNormal | Yes | 09-02-1992 | Commercial | ELO | Gravel | RL | 4.1 | 3.8 | 2.2 | 3.090 | 421094 | 92114 | 13159200 |
| 3 | P05346 | Velachery | 1855 | 13-03-2010 | 14 | 3.0 | 2.0 | 5 | Family | No | 18-03-1988 | Others | NoSewr | Paved | I | 4.7 | 3.9 | 3.6 | 4.010 | 356321 | 77042 | 9630290 |
| 4 | P06210 | Karapakkam | 1226 | 05-10-2009 | 84 | 1.0 | 1.0 | 3 | AbNormal | Yes | 13-10-1979 | Others | AllPub | Gravel | C | 3.0 | 2.5 | 4.1 | 3.290 | 237000 | 74063 | 7406250 |
df.shape
(7109, 22)
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 7109 entries, 0 to 7108 Data columns (total 22 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 PRT_ID 7109 non-null object 1 AREA 7109 non-null object 2 INT_SQFT 7109 non-null int64 3 DATE_SALE 7109 non-null object 4 DIST_MAINROAD 7109 non-null int64 5 N_BEDROOM 7108 non-null float64 6 N_BATHROOM 7104 non-null float64 7 N_ROOM 7109 non-null int64 8 SALE_COND 7109 non-null object 9 PARK_FACIL 7109 non-null object 10 DATE_BUILD 7109 non-null object 11 BUILDTYPE 7109 non-null object 12 UTILITY_AVAIL 7109 non-null object 13 STREET 7109 non-null object 14 MZZONE 7109 non-null object 15 QS_ROOMS 7109 non-null float64 16 QS_BATHROOM 7109 non-null float64 17 QS_BEDROOM 7109 non-null float64 18 QS_OVERALL 7061 non-null float64 19 REG_FEE 7109 non-null int64 20 COMMIS 7109 non-null int64 21 SALES_PRICE 7109 non-null int64 dtypes: float64(6), int64(6), object(10) memory usage: 1.2+ MB
df.isnull().sum()
PRT_ID 0 AREA 0 INT_SQFT 0 DATE_SALE 0 DIST_MAINROAD 0 N_BEDROOM 1 N_BATHROOM 5 N_ROOM 0 SALE_COND 0 PARK_FACIL 0 DATE_BUILD 0 BUILDTYPE 0 UTILITY_AVAIL 0 STREET 0 MZZONE 0 QS_ROOMS 0 QS_BATHROOM 0 QS_BEDROOM 0 QS_OVERALL 48 REG_FEE 0 COMMIS 0 SALES_PRICE 0 dtype: int64
from sklearn.impute import SimpleImputer
imputer=SimpleImputer(missing_values=np.nan,strategy='mean',fill_value=None)
df['N_BEDROOM'] = imputer.fit_transform(df[['N_BEDROOM']])
df['N_BATHROOM'] = imputer.fit_transform(df[['N_BATHROOM']])
df['QS_OVERALL'] = imputer.fit_transform(df[['QS_OVERALL']])
for col in df.columns:
if df[col].dtype=='object':
print()
print(col)
print(df[col].unique())
PRT_ID ['P03210' 'P09411' 'P01812' ... 'P09594' 'P06508' 'P09794'] AREA ['Karapakkam' 'Anna Nagar' 'Adyar' 'Velachery' 'Chrompet' 'KK Nagar' 'TNagar' 'T Nagar' 'Chrompt' 'Chrmpet' 'Karapakam' 'Ana Nagar' 'Chormpet' 'Adyr' 'Velchery' 'Ann Nagar' 'KKNagar'] DATE_SALE ['04-05-2011' '19-12-2006' '04-02-2012' ... '28-03-2014' '25-08-2006' '13-07-2005'] SALE_COND ['AbNormal' 'Family' 'Partial' 'AdjLand' 'Normal Sale' 'Ab Normal' 'Partiall' 'Adj Land' 'PartiaLl'] PARK_FACIL ['Yes' 'No' 'Noo'] DATE_BUILD ['15-05-1967' '22-12-1995' '09-02-1992' ... '01-09-1978' '11-08-1977' '24-07-1961'] BUILDTYPE ['Commercial' 'Others' 'Other' 'House' 'Comercial'] UTILITY_AVAIL ['AllPub' 'ELO' 'NoSewr ' 'NoSeWa' 'All Pub'] STREET ['Paved' 'Gravel' 'No Access' 'Pavd' 'NoAccess'] MZZONE ['A' 'RH' 'RL' 'I' 'C' 'RM']
df.AREA.replace(['Ana Nagar','Ann Nagar'],'Anna Nagar',inplace=True)
df.AREA.replace('Karapakkam','Karapakam',inplace=True)
df.AREA.replace(['Chrompt','Chrmpet','Chormpet'],'Chrompet',inplace=True)
df.AREA.replace('KKNagar','KK Nagar',inplace=True)
df.AREA.replace('TNagar','T Nagar',inplace=True)
df.AREA.replace('Adyr','Adyar',inplace=True)
df.AREA.replace('Velchery','Velachery',inplace=True)
df.BUILDTYPE.replace('Comercial','Commercial',inplace=True)
df.BUILDTYPE.replace('Other','Others',inplace=True)
df.UTILITY_AVAIL.replace('AllPub','All Pub',inplace=True)
df.UTILITY_AVAIL.replace(['NoSewr','NoSewr'],'NoSeWa',inplace=True)
df.SALE_COND.replace('Ab Normal','AbNormal',inplace=True)
df.SALE_COND.replace(['PartiaLl','Partiall'],'Partial',inplace=True)
df.SALE_COND.replace('Adj Land','AdjLand',inplace=True)
df.PARK_FACIL.replace('Noo','No',inplace=True)
df.STREET.replace('Pavd','Paved',inplace=True)
df.STREET.replace('NoAccess','No Access',inplace=True)
df.QS_ROOMS=df.QS_ROOMS.astype(int)
df.QS_BATHROOM=df.QS_BATHROOM.astype(int)
df.QS_BEDROOM=df.QS_BEDROOM.astype(int)
df.N_BEDROOM=df.N_BEDROOM.astype(int)
df.N_BATHROOM=df.N_BATHROOM.astype(int)
df.head()
| PRT_ID | AREA | INT_SQFT | DATE_SALE | DIST_MAINROAD | N_BEDROOM | N_BATHROOM | N_ROOM | SALE_COND | PARK_FACIL | DATE_BUILD | BUILDTYPE | UTILITY_AVAIL | STREET | MZZONE | QS_ROOMS | QS_BATHROOM | QS_BEDROOM | QS_OVERALL | REG_FEE | COMMIS | SALES_PRICE | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | P03210 | Karapakam | 1004 | 04-05-2011 | 131 | 1 | 1 | 3 | AbNormal | Yes | 15-05-1967 | Commercial | All Pub | Paved | A | 4 | 3 | 4 | 4.330 | 380000 | 144400 | 7600000 |
| 1 | P09411 | Anna Nagar | 1986 | 19-12-2006 | 26 | 2 | 1 | 5 | AbNormal | No | 22-12-1995 | Commercial | All Pub | Gravel | RH | 4 | 4 | 2 | 3.765 | 760122 | 304049 | 21717770 |
| 2 | P01812 | Adyar | 909 | 04-02-2012 | 70 | 1 | 1 | 3 | AbNormal | Yes | 09-02-1992 | Commercial | ELO | Gravel | RL | 4 | 3 | 2 | 3.090 | 421094 | 92114 | 13159200 |
| 3 | P05346 | Velachery | 1855 | 13-03-2010 | 14 | 3 | 2 | 5 | Family | No | 18-03-1988 | Others | NoSewr | Paved | I | 4 | 3 | 3 | 4.010 | 356321 | 77042 | 9630290 |
| 4 | P06210 | Karapakam | 1226 | 05-10-2009 | 84 | 1 | 1 | 3 | AbNormal | Yes | 13-10-1979 | Others | All Pub | Gravel | C | 3 | 2 | 4 | 3.290 | 237000 | 74063 | 7406250 |
df.describe()
| INT_SQFT | DIST_MAINROAD | N_BEDROOM | N_BATHROOM | N_ROOM | QS_ROOMS | QS_BATHROOM | QS_BEDROOM | QS_OVERALL | REG_FEE | COMMIS | SALES_PRICE | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 7109.000000 | 7109.000000 | 7109.000000 | 7109.000000 | 7109.000000 | 7109.000000 | 7109.000000 | 7109.000000 | 7109.000000 | 7109.000000 | 7109.000000 | 7.109000e+03 |
| mean | 1382.073006 | 99.603179 | 1.636939 | 1.213110 | 3.688704 | 3.075538 | 3.069349 | 3.056126 | 3.503254 | 376938.330708 | 141005.726544 | 1.089491e+07 |
| std | 457.410902 | 57.403110 | 0.802881 | 0.409534 | 1.019099 | 0.872937 | 0.876024 | 0.871499 | 0.525440 | 143070.662010 | 78768.093718 | 3.768603e+06 |
| min | 500.000000 | 0.000000 | 1.000000 | 1.000000 | 2.000000 | 2.000000 | 2.000000 | 2.000000 | 2.000000 | 71177.000000 | 5055.000000 | 2.156875e+06 |
| 25% | 993.000000 | 50.000000 | 1.000000 | 1.000000 | 3.000000 | 2.000000 | 2.000000 | 2.000000 | 3.130000 | 272406.000000 | 84219.000000 | 8.272100e+06 |
| 50% | 1373.000000 | 99.000000 | 1.000000 | 1.000000 | 4.000000 | 3.000000 | 3.000000 | 3.000000 | 3.503254 | 349486.000000 | 127628.000000 | 1.033505e+07 |
| 75% | 1744.000000 | 148.000000 | 2.000000 | 1.000000 | 4.000000 | 4.000000 | 4.000000 | 4.000000 | 3.880000 | 451562.000000 | 184506.000000 | 1.299390e+07 |
| max | 2500.000000 | 200.000000 | 4.000000 | 2.000000 | 6.000000 | 5.000000 | 5.000000 | 5.000000 | 4.970000 | 983922.000000 | 495405.000000 | 2.366734e+07 |
df.AREA.value_counts()
Chrompet 1702 Karapakam 1366 KK Nagar 997 Velachery 981 Anna Nagar 788 Adyar 774 T Nagar 501 Name: AREA, dtype: int64
df.groupby('AREA',as_index=False).count()
| AREA | PRT_ID | INT_SQFT | DATE_SALE | DIST_MAINROAD | N_BEDROOM | N_BATHROOM | N_ROOM | SALE_COND | PARK_FACIL | DATE_BUILD | BUILDTYPE | UTILITY_AVAIL | STREET | MZZONE | QS_ROOMS | QS_BATHROOM | QS_BEDROOM | QS_OVERALL | REG_FEE | COMMIS | SALES_PRICE | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Adyar | 774 | 774 | 774 | 774 | 774 | 774 | 774 | 774 | 774 | 774 | 774 | 774 | 774 | 774 | 774 | 774 | 774 | 774 | 774 | 774 | 774 |
| 1 | Anna Nagar | 788 | 788 | 788 | 788 | 788 | 788 | 788 | 788 | 788 | 788 | 788 | 788 | 788 | 788 | 788 | 788 | 788 | 788 | 788 | 788 | 788 |
| 2 | Chrompet | 1702 | 1702 | 1702 | 1702 | 1702 | 1702 | 1702 | 1702 | 1702 | 1702 | 1702 | 1702 | 1702 | 1702 | 1702 | 1702 | 1702 | 1702 | 1702 | 1702 | 1702 |
| 3 | KK Nagar | 997 | 997 | 997 | 997 | 997 | 997 | 997 | 997 | 997 | 997 | 997 | 997 | 997 | 997 | 997 | 997 | 997 | 997 | 997 | 997 | 997 |
| 4 | Karapakam | 1366 | 1366 | 1366 | 1366 | 1366 | 1366 | 1366 | 1366 | 1366 | 1366 | 1366 | 1366 | 1366 | 1366 | 1366 | 1366 | 1366 | 1366 | 1366 | 1366 | 1366 |
| 5 | T Nagar | 501 | 501 | 501 | 501 | 501 | 501 | 501 | 501 | 501 | 501 | 501 | 501 | 501 | 501 | 501 | 501 | 501 | 501 | 501 | 501 | 501 |
| 6 | Velachery | 981 | 981 | 981 | 981 | 981 | 981 | 981 | 981 | 981 | 981 | 981 | 981 | 981 | 981 | 981 | 981 | 981 | 981 | 981 | 981 | 981 |
fig = px.pie(df.groupby('AREA',as_index=False).count(),
names='AREA',
values='PRT_ID',
#labels={'PRT_ID':'Count'},
template='plotly_dark',
color_discrete_sequence=px.colors.sequential.Plasma,
hole=.5,
title='<b> Houses Count in different Areas of Chennai<b>'
)
fig.show()
pyo.plot(fig, filename = 'Houses Count in different Areas of Chennai.html', auto_open = False)
'Houses Count in different Areas of Chennai.html'
fig = px.box(df,
x='AREA',
y='SALES_PRICE',
color='AREA',
template='plotly_dark',
title='<b> Sales Price of Houses in different Areas')
fig.show()
pyo.plot(fig, filename = 'Sales Price of Houses in different Areas.html', auto_open = False)
'Sales Price of Houses in different Areas.html'
fig = px.scatter(df,
x='INT_SQFT',
y='SALES_PRICE',
color='AREA',
size='INT_SQFT',
labels={'INT_SQFT':'SQFT'},
template='plotly_dark',
title='<b> SQFT Versus SALES PRICE of Houses in Different Areas')
fig.show()
pyo.plot(fig, filename = 'SQFT Versus SALES PRICE of Houses in Different Areas.html', auto_open = False)
'SQFT Versus SALES PRICE of Houses in Different Areas.html'
fig = px.scatter(df,
x='DIST_MAINROAD',
y='SALES_PRICE',
color='AREA',
size='SALES_PRICE',
template='plotly_dark',
title='<b> MAINROAD DISTANCE Vs SALES_PRICE')
fig.show()
pyo.plot(fig, filename = 'MAINROAD DISTANCE Vs SALES_PRICE.html', auto_open = False)
'MAINROAD DISTANCE Vs SALES_PRICE.html'
fig =px.scatter(df,
x='DATE_BUILD',
y='SALES_PRICE',
color='AREA',
size='SALES_PRICE',
template='plotly_dark',
title='<b> Build Date Vs Sales Price')
fig.show()
pyo.plot(fig, filename = 'Build Date Vs Sales Price.html', auto_open = False)
fig=px.scatter(df,
x='DATE_SALE',
y='SALES_PRICE',
color='AREA',
size='SALES_PRICE',
template='plotly_dark',
title='<b> Sale Date Vs Sales Price')
fig.show()
pyo.plot(fig, filename = 'Sale Date Vs Sales Price.html', auto_open = False)
'Sale Date Vs Sales Price.html'
fig = px.box(df,
x='N_ROOM',
y='SALES_PRICE',
color='AREA',
template='plotly_dark',
title='<b>Total Rooms Versus Sales Price of Houses in Different Areas')
fig.show()
pyo.plot(fig, filename = 'Total Rooms Versus Sales Price of Houses in Different Areas.html', auto_open = False)
'Total Rooms Versus Sales Price of Houses in Different Areas.html'
fig = px.box(df,
x='N_BEDROOM',
#y='AREA',
y='SALES_PRICE',
color = 'AREA',
template='plotly_dark',
title='<b>Total BedRooms Versus Sales Price of Houses in Different Areas')
fig.show()
pyo.plot(fig, filename = 'Total BedRooms Versus Sales Price of Houses in Different Areas.html', auto_open = False)
'Total BedRooms Versus Sales Price of Houses in Different Areas.html'
fig = px.box(df,
x='N_BATHROOM',
y='SALES_PRICE',
color='AREA',
template='plotly_dark',
title='<b>Total BathRooms Versus Sales Price of Houses in Different Areas')
fig.show()
pyo.plot(fig, filename = 'Total BedRooms Versus Sales Price of Houses in Different Areas.html', auto_open = False)
'Total BedRooms Versus Sales Price of Houses in Different Areas.html'
fig = px.box(df,
x='SALE_COND',
y='SALES_PRICE',
color='AREA',
template='plotly_dark',
title='<b>Sale Condition Versus Sales Price of Houses in Different Areas')
fig.show()
pyo.plot(fig, filename = 'Sale Condition Versus Sales Price of Houses in Different Areas.html', auto_open = False)
'Sale Condition Versus Sales Price of Houses in Different Areas.html'
fig = px.box(df,
x='PARK_FACIL',
y='SALES_PRICE',
color='AREA',
template='plotly_dark',
title='<b>Parking Facility Versus Sales Price of Houses in Different Areas')
fig.show()
pyo.plot(fig, filename = 'Parking Facility Versus Sales Price of Houses in Different Areas.html', auto_open = False)
'Parking Facility Versus Sales Price of Houses in Different Areas.html'
fig = px.box(df,
x='BUILDTYPE',
y='SALES_PRICE',
color='AREA',
template='plotly_dark',
title='<b>Build Type Versus Sales Price of Houses in Different Areas')
fig.show()
pyo.plot(fig, filename = 'Build Type Versus Sales Price of Houses in Different Areas.html', auto_open = False)
'Build Type Versus Sales Price of Houses in Different Areas.html'
fig = px.box(df,
x='STREET',
y='SALES_PRICE',
color='AREA',
template='plotly_dark',
title='<b> Street Versus Sales Price of Houses in Different Areas')
fig.show()
pyo.plot(fig, filename = 'Street Versus Sales Price of Houses in Different Areas.html', auto_open = False)
'Street Versus Sales Price of Houses in Different Areas.html'
df.corr()
| INT_SQFT | DIST_MAINROAD | N_BEDROOM | N_BATHROOM | N_ROOM | QS_ROOMS | QS_BATHROOM | QS_BEDROOM | QS_OVERALL | REG_FEE | COMMIS | SALES_PRICE | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| INT_SQFT | 1.000000 | 0.002022 | 0.786263 | 0.515290 | 0.951279 | 0.016146 | -0.011463 | 0.005847 | 0.013989 | 0.657544 | 0.571076 | 0.612125 |
| DIST_MAINROAD | 0.002022 | 1.000000 | -0.002723 | 0.001850 | 0.002301 | 0.013050 | -0.021857 | 0.002757 | -0.017255 | 0.011600 | 0.010994 | 0.018783 |
| N_BEDROOM | 0.786263 | -0.002723 | 1.000000 | 0.755207 | 0.840209 | 0.011434 | -0.008603 | 0.014449 | 0.014245 | 0.455380 | 0.430446 | 0.330987 |
| N_BATHROOM | 0.515290 | 0.001850 | 0.755207 | 1.000000 | 0.568541 | 0.010452 | -0.011005 | 0.013390 | 0.007961 | 0.260244 | 0.256726 | 0.108834 |
| N_ROOM | 0.951279 | 0.002301 | 0.840209 | 0.568541 | 1.000000 | 0.013468 | -0.009066 | 0.012072 | 0.015418 | 0.630932 | 0.533343 | 0.602760 |
| QS_ROOMS | 0.016146 | 0.013050 | 0.011434 | 0.010452 | 0.013468 | 1.000000 | 0.001796 | 0.009590 | 0.489271 | 0.014614 | 0.007072 | 0.016059 |
| QS_BATHROOM | -0.011463 | -0.021857 | -0.008603 | -0.011005 | -0.009066 | 0.001796 | 1.000000 | -0.011917 | 0.513585 | -0.014777 | -0.008397 | -0.019469 |
| QS_BEDROOM | 0.005847 | 0.002757 | 0.014449 | 0.013390 | 0.012072 | 0.009590 | -0.011917 | 1.000000 | 0.598154 | 0.018875 | 0.019376 | 0.015044 |
| QS_OVERALL | 0.013989 | -0.017255 | 0.014245 | 0.007961 | 0.015418 | 0.489271 | 0.513585 | 0.598154 | 1.000000 | 0.022410 | 0.016946 | 0.020485 |
| REG_FEE | 0.657544 | 0.011600 | 0.455380 | 0.260244 | 0.630932 | 0.014614 | -0.014777 | 0.018875 | 0.022410 | 1.000000 | 0.659903 | 0.878148 |
| COMMIS | 0.571076 | 0.010994 | 0.430446 | 0.256726 | 0.533343 | 0.007072 | -0.008397 | 0.019376 | 0.016946 | 0.659903 | 1.000000 | 0.626275 |
| SALES_PRICE | 0.612125 | 0.018783 | 0.330987 | 0.108834 | 0.602760 | 0.016059 | -0.019469 | 0.015044 | 0.020485 | 0.878148 | 0.626275 | 1.000000 |
plt.figure(figsize=(12,6))
sn.heatmap(df.corr(),annot=True,cmap='CMRmap')
plt.title('Heatmap of the Data')
plt.show()
# Creating a Back up File
df1=df.copy()
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
for col in df.columns:
if df[col].dtype=='object':
df[col]=le.fit_transform(df[col])
df.head()
| PRT_ID | AREA | INT_SQFT | DATE_SALE | DIST_MAINROAD | N_BEDROOM | N_BATHROOM | N_ROOM | SALE_COND | PARK_FACIL | DATE_BUILD | BUILDTYPE | UTILITY_AVAIL | STREET | MZZONE | QS_ROOMS | QS_BATHROOM | QS_BEDROOM | QS_OVERALL | REG_FEE | COMMIS | SALES_PRICE | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2266 | 4 | 1004 | 310 | 131 | 1 | 1 | 3 | 0 | 1 | 2731 | 0 | 0 | 2 | 0 | 4 | 3 | 4 | 4.330 | 380000 | 144400 | 7600000 |
| 1 | 6664 | 1 | 1986 | 1745 | 26 | 2 | 1 | 5 | 0 | 0 | 4226 | 0 | 0 | 0 | 3 | 4 | 4 | 2 | 3.765 | 760122 | 304049 | 21717770 |
| 2 | 1270 | 0 | 909 | 288 | 70 | 1 | 1 | 3 | 0 | 1 | 1511 | 0 | 1 | 0 | 4 | 4 | 3 | 2 | 3.090 | 421094 | 92114 | 13159200 |
| 3 | 3755 | 6 | 1855 | 1117 | 14 | 3 | 2 | 5 | 2 | 0 | 3300 | 2 | 3 | 2 | 2 | 4 | 3 | 3 | 4.010 | 356321 | 77042 | 9630290 |
| 4 | 4393 | 4 | 1226 | 442 | 84 | 1 | 1 | 3 | 0 | 1 | 2406 | 2 | 0 | 0 | 1 | 3 | 2 | 4 | 3.290 | 237000 | 74063 | 7406250 |
x=df.drop('SALES_PRICE',axis=1)
y=df['SALES_PRICE']
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.25,random_state=42)
x_train.shape,x_test.shape,y_train.shape,y_test.shape
((5331, 21), (1778, 21), (5331,), (1778,))
from sklearn.preprocessing import MinMaxScaler
mmscaler=MinMaxScaler(feature_range=(0,1))
ms = mmscaler.fit(x_train)
x_train = ms.transform(x_train)
x_test = ms.transform(x_test)
x_train = pd.DataFrame(x_train)
x_test = pd.DataFrame(x_test)
a = {'Model Name':[],
'Mean_Absolute_Error_MAE':[] ,
'Adj_R_Square':[] ,
'Root_Mean_Squared_Error_RMSE':[] ,
'Mean_Absolute_Percentage_Error_MAPE':[] ,
'Mean_Squared_Error_MSE':[] ,
'Root_Mean_Squared_Log_Error_RMSLE':[] ,
'R2_score':[]}
Results=pd.DataFrame(a)
Results.head()
| Model Name | Mean_Absolute_Error_MAE | Adj_R_Square | Root_Mean_Squared_Error_RMSE | Mean_Absolute_Percentage_Error_MAPE | Mean_Squared_Error_MSE | Root_Mean_Squared_Log_Error_RMSLE | R2_score |
|---|
# Import the library
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
import xgboost as xgb
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import GradientBoostingRegressor
lg = LinearRegression()
dt = DecisionTreeRegressor()
rf = RandomForestRegressor()
SVR = SVR()
xgbr = xgb.XGBRegressor()
KNN = KNeighborsRegressor(n_neighbors=5)
ETR = ExtraTreesRegressor()
Model = [lg, dt, rf, SVR,KNN, ETR, xgbr]
model_GBR = GradientBoostingRegressor(loss='ls', learning_rate=0.1, n_estimators=100, subsample=1.0,
criterion='friedman_mse', min_samples_split=2, min_samples_leaf=1,
min_weight_fraction_leaf=0.0, max_depth=3, min_impurity_decrease=0.0,
init=None, random_state=None, max_features=None,
alpha=0.9, verbose=0, max_leaf_nodes=None, warm_start=False,
validation_fraction=0.1, n_iter_no_change=None, tol=0.0001, ccp_alpha=0.0)
# Evalution matrix for all the algorithms
for model in Model:
# Fit the model with train data
model.fit(x_train, y_train)
# Predict the model with test data
y_pred = model.predict(x_test)
# Print the model name
print('Model Name: ', model)
# Evaluation metrics for Regression analysis
from sklearn import metrics
print('Mean Absolute Error (MAE):', round(metrics.mean_absolute_error(y_test, y_pred),3))
print('Mean Squared Error (MSE):', round(metrics.mean_squared_error(y_test, y_pred),3))
print('Root Mean Squared Error (RMSE):', round(np.sqrt(metrics.mean_squared_error(y_test, y_pred)),3))
print('R2_score:', round(metrics.r2_score(y_test, y_pred),6))
print('Root Mean Squared Log Error (RMSLE):', round(np.log(np.sqrt(metrics.mean_squared_error(y_test, y_pred))),3))
# Define the function to calculate the MAPE - Mean Absolute Percentage Error
def MAPE (y_test, y_pred):
y_test, y_pred = np.array(y_test), np.array(y_pred)
return np.mean(np.abs((y_test - y_pred) / y_test)) * 100
# Evaluation of MAPE
result = MAPE(y_test, y_pred)
print('Mean Absolute Percentage Error (MAPE):', round(result, 2), '%')
# Calculate Adjusted R squared values
r_squared = round(metrics.r2_score(y_test, y_pred),6)
adjusted_r_squared = round(1 - (1-r_squared)*(len(y)-1)/(len(y)-x.shape[1]-1),6)
print('Adj R Square: ', adjusted_r_squared)
print('******************************************************************************************************************')
#-------------------------------------------------------------------------------------------
new_row = {'Model Name' : model,
'Mean_Absolute_Error_MAE' : metrics.mean_absolute_error(y_test, y_pred),
'Adj_R_Square' : adjusted_r_squared,
'Root_Mean_Squared_Error_RMSE' : np.sqrt(metrics.mean_squared_error(y_test, y_pred)),
'Mean_Absolute_Percentage_Error_MAPE' : result,
'Mean_Squared_Error_MSE' : metrics.mean_squared_error(y_test, y_pred),
'Root_Mean_Squared_Log_Error_RMSLE': np.log(np.sqrt(metrics.mean_squared_error(y_test, y_pred))),
'R2_score' : metrics.r2_score(y_test, y_pred)}
Results = Results.append(new_row, ignore_index=True)
print()
Model Name: LinearRegression()
Mean Absolute Error (MAE): 1022554.549
Mean Squared Error (MSE): 1663603131167.479
Root Mean Squared Error (RMSE): 1289807.401
R2_score: 0.875211
Root Mean Squared Log Error (RMSLE): 14.07
Mean Absolute Percentage Error (MAPE): 10.0 %
Adj R Square: 0.874841
******************************************************************************************************************
Model Name: DecisionTreeRegressor()
Mean Absolute Error (MAE): 766160.228
Mean Squared Error (MSE): 1083848977254.26
Root Mean Squared Error (RMSE): 1041080.678
R2_score: 0.918699
Root Mean Squared Log Error (RMSLE): 13.856
Mean Absolute Percentage Error (MAPE): 7.61 %
Adj R Square: 0.918458
******************************************************************************************************************
Model Name: RandomForestRegressor()
Mean Absolute Error (MAE): 529879.249
Mean Squared Error (MSE): 451763452981.996
Root Mean Squared Error (RMSE): 672133.508
R2_score: 0.966113
Root Mean Squared Log Error (RMSLE): 13.418
Mean Absolute Percentage Error (MAPE): 5.39 %
Adj R Square: 0.966013
******************************************************************************************************************
Model Name: SVR()
Mean Absolute Error (MAE): 2830943.966
Mean Squared Error (MSE): 13621656915965.639
Root Mean Squared Error (RMSE): 3690752.893
R2_score: -0.02178
Root Mean Squared Log Error (RMSLE): 15.121
Mean Absolute Percentage Error (MAPE): 28.67 %
Adj R Square: -0.024808
******************************************************************************************************************
Model Name: KNeighborsRegressor()
Mean Absolute Error (MAE): 1260718.96
Mean Squared Error (MSE): 2848745691648.293
Root Mean Squared Error (RMSE): 1687822.767
R2_score: 0.786312
Root Mean Squared Log Error (RMSLE): 14.339
Mean Absolute Percentage Error (MAPE): 11.92 %
Adj R Square: 0.785679
******************************************************************************************************************
Model Name: ExtraTreesRegressor()
Mean Absolute Error (MAE): 369291.913
Mean Squared Error (MSE): 217005195335.877
Root Mean Squared Error (RMSE): 465838.164
R2_score: 0.983722
Root Mean Squared Log Error (RMSLE): 13.052
Mean Absolute Percentage Error (MAPE): 3.82 %
Adj R Square: 0.983674
******************************************************************************************************************
Model Name: XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
early_stopping_rounds=None, enable_categorical=False,
eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
importance_type=None, interaction_constraints='',
learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,
max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
missing=nan, monotone_constraints='()', n_estimators=100, n_jobs=0,
num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
reg_lambda=1, ...)
Mean Absolute Error (MAE): 357348.064
Mean Squared Error (MSE): 213624495167.217
Root Mean Squared Error (RMSE): 462195.3
R2_score: 0.983976
Root Mean Squared Log Error (RMSLE): 13.044
Mean Absolute Percentage Error (MAPE): 3.57 %
Adj R Square: 0.983929
******************************************************************************************************************
Results
| Model Name | Mean_Absolute_Error_MAE | Adj_R_Square | Root_Mean_Squared_Error_RMSE | Mean_Absolute_Percentage_Error_MAPE | Mean_Squared_Error_MSE | Root_Mean_Squared_Log_Error_RMSLE | R2_score | |
|---|---|---|---|---|---|---|---|---|
| 0 | LinearRegression() | 1.022555e+06 | 0.874841 | 1.289807e+06 | 10.004632 | 1.663603e+12 | 14.070003 | 0.875211 |
| 1 | DecisionTreeRegressor() | 7.661602e+05 | 0.918458 | 1.041081e+06 | 7.613756 | 1.083849e+12 | 13.855770 | 0.918699 |
| 2 | (DecisionTreeRegressor(max_features=1.0, rando... | 5.298792e+05 | 0.966013 | 6.721335e+05 | 5.385192 | 4.517635e+11 | 13.418212 | 0.966113 |
| 3 | SVR() | 2.830944e+06 | -0.024808 | 3.690753e+06 | 28.670171 | 1.362166e+13 | 15.121341 | -0.021780 |
| 4 | KNeighborsRegressor() | 1.260719e+06 | 0.785679 | 1.687823e+06 | 11.924981 | 2.848746e+12 | 14.338950 | 0.786312 |
| 5 | (ExtraTreeRegressor(random_state=1390877790), ... | 3.692919e+05 | 0.983674 | 4.658382e+05 | 3.816738 | 2.170052e+11 | 13.051594 | 0.983722 |
| 6 | XGBRegressor(base_score=0.5, booster='gbtree',... | 3.573481e+05 | 0.983929 | 4.621953e+05 | 3.570879 | 2.136245e+11 | 13.043743 | 0.983976 |
# Training the Model
xgbr.fit(x_train, y_train)
# Predict the model with test data
y_pred_xgb = xgbr.predict(x_test)
# Training the Model
ETR.fit(x_train, y_train)
# Predict the model with test data
y_pred_ETR = ETR.predict(x_test)
output= pd.DataFrame({'Price_actual':y_test, 'Price_pred (XGB)':y_pred_xgb, 'Price_pred (ETR)': y_pred_ETR})
result=df1.merge(output,left_index=True,right_index=True)
#result.head()
result[['PRT_ID','AREA','Price_actual','Price_pred (XGB)','Price_pred (ETR)']].sample(20)
| PRT_ID | AREA | Price_actual | Price_pred (XGB) | Price_pred (ETR) | |
|---|---|---|---|---|---|
| 3289 | P04715 | Karapakam | 8553000 | 8648174.0 | 8289827.50 |
| 4220 | P06076 | Velachery | 13685190 | 13847931.0 | 13796077.40 |
| 149 | P07490 | KK Nagar | 12472000 | 12543332.0 | 12593784.00 |
| 3431 | P06582 | Karapakam | 7866250 | 8243390.0 | 7543856.00 |
| 380 | P09981 | Velachery | 12350640 | 13170537.0 | 12647901.20 |
| 6903 | P01512 | T Nagar | 17414970 | 15758530.0 | 17159051.00 |
| 3197 | P06601 | Karapakam | 8372750 | 8335412.0 | 7981913.25 |
| 3822 | P05119 | KK Nagar | 16814260 | 17245898.0 | 17006137.40 |
| 2310 | P05308 | Karapakam | 8008250 | 7777017.0 | 7953017.50 |
| 6044 | P09738 | Velachery | 14602040 | 14162924.0 | 15240464.80 |
| 4538 | P06888 | Anna Nagar | 21008600 | 20199684.0 | 20767024.00 |
| 4836 | P05101 | Velachery | 13160490 | 13836980.0 | 13360752.70 |
| 3718 | P04468 | Karapakam | 7689500 | 6666313.0 | 6443398.75 |
| 3317 | P00958 | Chrompet | 10847700 | 10876618.0 | 10985148.90 |
| 2885 | P01370 | Chrompet | 9190000 | 9059926.0 | 8989433.50 |
| 1022 | P03690 | Karapakam | 9888000 | 10544028.0 | 10901647.65 |
| 5938 | P03472 | Chrompet | 8905710 | 8451058.0 | 8217775.15 |
| 5243 | P01754 | Karapakam | 11837500 | 11679182.0 | 11408986.60 |
| 5758 | P06930 | KK Nagar | 8545660 | 9173977.0 | 8930851.40 |
| 2283 | P09776 | Velachery | 15941330 | 16022495.0 | 16274440.30 |
fig = px.scatter(result,
x='Price_actual',
y='Price_pred (XGB)',
trendline='ols',
color = 'AREA',
template='plotly_dark',
title='<b> Actual Price Vs Predicted Price with XGBRegressor')
fig.show()
fig = px.scatter(result,
x='Price_actual',
y='Price_pred (ETR)',
trendline='ols',
color = 'AREA',
template='plotly_dark',
title='<b> Actual Price Vs Predicted Price with ExtraTreesRegressor')
fig.show()
pyo.plot(fig, filename = 'Actual Price Vs Predicted Price.html', auto_open = False)
'Actual Price Vs Predicted Price.html'